Exploring associations of miRNA expression levels across tumor types in TCGA with suggested correlates defined by the Immune Response Working Group:
Data are stored on Synapse in the folder syn6171109.
Sample characteristics are stored in a tab-delimited text file (Synapse ID: syn7222010) and can be loaded with read_tsv().
# load sample data
mirna_sample_file <- mirna_files %>%
filter(file.id == "syn7222010") %>%
.[["file_path"]]
mirna_sample_df <- read_tsv(mirna_sample_file)
Parsed with column specification:
cols(
id = col_character(),
Disease = col_character(),
Sample_Type = col_integer(),
Protocol = col_character(),
Platform = col_character()
)
Sample Quality Annotations: syn4551248 (“merged_sample_quality_annotations.tsv”)
sample_qual_file <- synGet("syn4551248", downloadLocation = "../data/tcga/")
sample_qual_df <- sample_qual_file %>%
getFileLocation() %>%
read_tsv()
Parsed with column specification:
cols(
patient_barcode = col_character(),
aliquot_barcode = col_character(),
`cancer type` = col_character(),
platform = col_character(),
patient_annotation = col_character(),
sample_annotation = col_character(),
aliquot_annotation = col_character(),
aliquot_annotation_updated = col_character(),
AWG_excluded_because_of_pathology = col_double(),
AWG_pathology_exclusion_reason = col_character(),
Reviewed_by_EPC = col_double(),
Do_not_use = col_character()
)
remove samples based on Do_not_use=True, and remove cases with AWG_excluded_because_of_pathology=True
# samples to exclude from all datasets
exclude_samples <- sample_qual_df %>%
mutate(AWG_excluded_because_of_pathology = parse_logical(AWG_excluded_because_of_pathology),
Do_not_use = parse_logical(str_to_upper(Do_not_use))) %>%
filter(AWG_excluded_because_of_pathology | Do_not_use)
Remove samples from miRNA dataset.
mirna_sample_df <- mirna_sample_df %>%
filter(!(id %in% exclude_samples$aliquot_barcode))
miRNA normalized, batch corrected expression values for all samples are stored as a matrix in a CSV file (Synapse ID: syn7201053) and can be loaded with read_csv().
mirna_corr_file <- "../data/mirna_correlate_data.feather"
force_read <- FALSE
if (!file.exists(mirna_corr_file) | force_read) {
# load normalized, batch-corrected expression data
mirna_norm_file <- mirna_files %>%
filter(file.id == "syn7201053") %>%
.[["file_path"]]
mirna_norm_df <- read_csv(mirna_norm_file, progress = FALSE)
mirna_corr_df <- mirna_norm_df %>%
select(one_of(c("Genes", mirna_sample_pt_df$id)))
write_feather(mirna_corr_df, mirna_corr_file)
rm(mirna_norm_df)
} else {
mirna_corr_df <- read_feather(mirna_corr_file)
}
tcga_colors <- tribble(
~Color, ~Disease,
"#ED2891", "BRCA",
"#B2509E", "GBM",
"#D49DC7", "LGG",
"#C1A72F", "ACC",
"#E8C51D", "PCPG",
"#F9ED32", "THCA",
"#CACCDB", "CHOL",
"#9EDDF9", "COAD",
"#007EB5", "ESCA",
"#104A7F", "LIHC",
"#6E7BA2", "PAAD",
"#DAF1FC", "READ",
"#00AEEF", "STAD",
"#F6B667", "CESC",
"#D97D25", "OV",
"#F89420", "UCEC",
"#FBE3C7", "UCS",
"#009444", "HNSC",
"#97D1A9", "UVM",
"#754C29", "LAML",
"#CEAC8F", "THYM",
"#3953A4", "DLBC",
"#BBD642", "SKCM",
"#00A99D", "SARC",
"#542C88", "LUAD",
"#A084BD", "LUSC",
"#D3C3E0", "MESO",
"#FAD2D9", "BLCA",
"#EA7075", "KICH",
"#ED1C24", "KIRC",
"#F8AFB3", "KIRP",
"#7E1918", "PRAD",
"#BE1E2D", "TGCT"
)
mirna_sample_df <- mirna_sample_df %>%
mutate(Disease = factor(Disease, levels = tcga_colors$Disease))
The table mirna_sample_df contains 5 columns describing the 10543 samples in the data. The Sample_Type column corresponds to TCGA sample type codes, which are defined here. The following sample types are included in the miRNA data:
mirna_sample_df %>%
group_by(Sample_Type) %>%
tally()
Based on the codes, this is the distribution of sample types:
mirna_sample_df %>%
group_by(Sample_Type) %>%
tally() %>%
mutate(Definition = case_when(
.$Sample_Type == 1 ~ "Primary Solid Tumor",
.$Sample_Type == 2 ~ "Recurrent Solid Tumorr",
.$Sample_Type == 3 ~ "Primary Blood Derived Cancer - Peripheral Blood",
.$Sample_Type == 5 ~ "Additional - New Primary",
.$Sample_Type == 6 ~ "Metastatic",
.$Sample_Type == 7 ~ "Additional Metastatic",
.$Sample_Type == 11 ~ "Solid Tissue Normal"
))
Note: only include “primary” tumor samples for now; also, add additional column to store vial ID (to ease mapping between data sets).
mirna_sample_pt_df <- mirna_sample_df %>%
filter(Sample_Type %in% c(1, 3, 5)) %>%
mutate(vial_id = str_replace(id, "(\\-[:alnum:]+){3}$", ""))
Additionally, the following disease types are included:
mirna_sample_pt_df %>%
group_by(Disease) %>%
tally()
Note: exclude LAML, THYM, and DLBC from cell content correlations
And technical variables:
mirna_sample_pt_df %>%
group_by(Protocol, Platform) %>%
tally()
Expression values in the data are reportedly reads per million (RPM). I’ve randomly selected a few samples from each disease type, sample type, protocol, and platform to inspect the distribution of expression values across all 743 miRNA genes.
set.seed(0)
# randomly select 1-3 samples from each combination of characteristics
sample_sub_df <- mirna_sample_pt_df %>%
group_by(Disease, Sample_Type, Protocol, Platform) %>%
sample_n(3, replace = TRUE) %>%
ungroup() %>%
distinct()
# subset and melt the expression data
mirna_sub_df <- mirna_corr_df %>%
select(one_of(c("Genes", sample_sub_df$id))) %>%
gather("sample", "expression", -Genes)
Even with a shifted log (log10(x + 1)) transformation, expression values look to be more exponentially distributed than normal.
mirna_sub_df %>%
left_join(mirna_sample_pt_df, by = c("sample" = "id")) %>%
ggplot(aes(x = log10(expression + 1))) +
stat_density(aes(group = sample), geom = "line", position = "identity",
alpha = 0.2)
I can also look at the distribution of log-RPM values with boxplots:
mirna_sub_df %>%
left_join(mirna_sample_pt_df, by = c("sample" = "id")) %>%
ggplot(aes(x = sample, y = log10(expression + 1))) +
geom_boxplot(aes(fill = Disease), outlier.size = 0.5) +
theme(axis.text.x = element_blank()) +
scale_fill_manual(values = tcga_colors$Color)
# convert expression df to matrix
mirna_corr_mat <- mirna_corr_df %>%
select(one_of(c("Genes", mirna_sample_pt_df$id))) %>%
column_to_rownames("Genes") %>%
as.matrix()
I used the prcomp() function on the transposed expression matrix (samples x genes, after transposing) to compute PCA data, which I can use to look for batch effects among samples.
mirna_corr_pca <- mirna_corr_mat %>%
t() %>%
prcomp()
(I can use the broom:tidy() function to convert data in the prcomp object into data frames for ggplot.)
pc_df <- mirna_corr_pca %>%
tidy("pcs")
mirna_corr_pca_df <- mirna_corr_pca %>%
tidy("samples") %>%
filter(PC <= 2) %>%
left_join(mirna_sample_pt_df, by = c("row" = "id"))
rm(mirna_corr_mat)
The plot below shows samples plotted as points along the first two principle components (PCs). Points are colored by disease type.
pc1_label <- pc_df %>%
filter(PC == 1) %>%
transmute(label = sprintf("PC%s [%0.2f%%]", PC, 100*percent)) %>%
flatten_chr()
pc2_label <- pc_df %>%
filter(PC == 2) %>%
transmute(label = sprintf("PC%s [%0.2f%%]", PC, 100*percent)) %>%
flatten_chr()
mirna_corr_pca_df %>%
spread(PC, value) %>%
ggplot(aes(x = `1`, y = `2`)) +
geom_point(aes(colour = Disease)) +
xlab(pc1_label) +
ylab(pc2_label) +
scale_color_manual(values = tcga_colors$Color)
Facetting by sample type…
mirna_corr_pca_df %>%
spread(PC, value) %>%
ggplot(aes(x = `1`, y = `2`)) +
geom_point(aes(colour = Disease)) +
xlab(pc1_label) +
ylab(pc2_label) +
scale_color_manual(values = tcga_colors$Color) +
facet_wrap(~ Sample_Type)
Cellular Content: syn7994728 syn5808205 (“TCGA_all_leuk_estimate.masked.20170107.tsv”)
# file_data <- synGet("syn5808205", downloadLocation = "./")
leuk_frac_file <- "../data/tcga/TCGA_all_leuk_estimate.masked.20170107.tsv"
leuk_frac_df <- read_tsv(leuk_frac_file, col_names = FALSE) %>%
set_names(c("disease", "id", "leuk_frac"))
Parsed with column specification:
cols(
X1 = col_character(),
X2 = col_character(),
X3 = col_double()
)
leuk_frac_df <- leuk_frac_df %>%
filter(!(id %in% exclude_samples$aliquot_barcode))
Identify matched samples between miRNA and leukocyte fraction.
leuk_frac_ids <- leuk_frac_df %>%
select(id) %>%
mutate(vial_id = str_replace(id, "(\\-[:alnum:]+){3}$", "")) %>%
arrange()
mirna_ids <- mirna_sample_pt_df %>%
select(id, vial_id) %>%
arrange()
mirna_leuk_frac_shared_ids <- inner_join(mirna_ids, leuk_frac_ids,
by = "vial_id",
suffix = c("_mirna", "_leuk_frac"))
# only keep samples with matched vial ID AND portion number
portion_id_minus_analyte_regex <- "([:alnum:]+\\-){4}[0-9]+"
# plate_id_only_regex <- "[:alnum:]+(?=([:alnum:]\\-[:alnum:]+){1}$)"
mirna_leuk_frac_shared_ids <- mirna_leuk_frac_shared_ids %>%
filter((str_extract(id_mirna, portion_id_minus_analyte_regex)
== str_extract(id_leuk_frac, portion_id_minus_analyte_regex)))
NOTE: several samples were assayed on multiple plates; average the leukocyte fraction across these before computing correlations with miRNA
leuk_frac_corr_file <- "../data/leuk_frac_correlates_for_mirna.feather"
force_format <- FALSE
if (!file.exists(leuk_frac_corr_file) | force_format) {
leuk_frac_corr_df <- mirna_leuk_frac_shared_ids %>%
left_join(leuk_frac_df, by = c("id_leuk_frac" = "id")) %>%
group_by(disease, vial_id) %>%
summarise(leuk_frac = mean(leuk_frac)) %>%
ungroup()
write_feather(leuk_frac_corr_df, leuk_frac_corr_file)
} else {
leuk_frac_corr_df <- read_feather(leuk_frac_corr_file)
}
mirna_leuk_frac_corr_file <- "../results/mirna_leuk_frac_correlation.feather"
force_compute <- FALSE
if (!file.exists(mirna_leuk_frac_corr_file) | force_compute) {
# skip immune cell cancers?
d_list <- mirna_sample_pt_df %>%
# filter((!Disease %in% c("LAML", "THYM", "DLBC"))) %>%
filter(Disease %in% leuk_frac_corr_df$disease) %>%
distinct(Disease) %>%
mutate(Disease = as.character(Disease)) %>%
.$Disease %>%
set_names(.) %>%
as.list()
corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
samples_d <- mirna_sample_pt_df %>%
filter(Disease == d) %>%
select(vial_id) %>%
flatten_chr() %>%
intersect(leuk_frac_corr_df$vial_id)
source_df <- mirna_corr_df %>%
set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
select(one_of(c("Genes", samples_d))) %>%
dplyr::rename(mirna = Genes) %>%
gather(sample, x, -mirna) %>%
filter(!is.na(x))
target_df <- leuk_frac_corr_df %>%
filter(vial_id %in% samples_d) %>%
dplyr::rename(sample = vial_id, x = leuk_frac) %>%
mutate(correlate = "leuk_frac")
corr_df <- inner_join(source_df, target_df,
by = "sample",
suffix = c("_source", "_target")) %>%
group_by(mirna, correlate) %>%
do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>%
ungroup()
corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
return(corr_df)
})
mirna_leuk_frac_corr_df <- bind_rows(corr_df_list, .id = "disease") %>%
mutate(correlate_type = "leukocyte fraction")
write_feather(mirna_leuk_frac_corr_df, mirna_leuk_frac_corr_file)
} else {
mirna_leuk_frac_corr_df <- read_feather(mirna_leuk_frac_corr_file)
}
Cellular Content: syn4991611 syn8024565 (“TCGA.cluster-by-CIBERSORT-relative.tsv”)
or…?
syn7337221 (“TCGA.Kallisto.fullIDs.cibersort.relative.tsv”)
# ciber_frac_file <- "../data/tcga/TCGA.cluster-by-CIBERSORT-relative.tsv"
ciber_frac_file <- "../data/tcga/TCGA.Kallisto.fullIDs.cibersort.relative.tsv"
ciber_frac_df <- read_tsv(ciber_frac_file)
Parsed with column specification:
cols(
.default = col_double(),
SampleID = col_character(),
CancerType = col_character()
)
See spec(...) for full column specifications.
ciber_frac_df <- ciber_frac_df %>%
mutate(id = str_replace_all(SampleID, "\\.", "\\-")) %>%
filter(!(id %in% exclude_samples$aliquot_barcode))
Identify matched samples between miRNA and CIBERSORT fraction.
ciber_frac_ids <- ciber_frac_df %>%
select(id) %>%
mutate(vial_id = str_replace(id, "(\\-[:alnum:]+){3}$", "")) %>%
arrange()
# only keep samples with matched vial ID AND portion number
mirna_ciber_frac_shared_ids <- inner_join(mirna_ids, ciber_frac_ids,
by = "vial_id",
suffix = c("_mirna", "_ciber_frac")) %>%
distinct() %>%
filter((str_extract(id_mirna, portion_id_minus_analyte_regex)
== str_extract(id_ciber_frac, portion_id_minus_analyte_regex)))
NOTE: several samples were assayed on multiple plates; average the CIBERSORT fraction across these before computing correlations with miRNA
ciber_frac_corr_file <- "../data/ciber_frac_correlates_for_mirna.feather"
force_format <- TRUE
if (!file.exists(ciber_frac_corr_file) | force_format) {
ciber_frac_corr_df <- mirna_ciber_frac_shared_ids %>%
left_join(ciber_frac_df, by = c("id_ciber_frac" = "id")) %>%
select(-id_mirna, -id_ciber_frac,
-SampleID, -P.value, -Correlation, -RMSE) %>%
group_by(CancerType, vial_id) %>%
summarise_each(funs(mean)) %>%
ungroup()
write_feather(ciber_frac_corr_df, ciber_frac_corr_file)
} else {
ciber_frac_corr_df <- read_feather(ciber_frac_corr_file)
}
mirna_ciber_frac_corr_file <- "../results/mirna_ciber_frac_correlation.feather"
force_compute <- TRUE
if (!file.exists(mirna_ciber_frac_corr_file) | force_compute) {
# skip immune cell cancers?
d_list <- mirna_sample_pt_df %>%
# filter((!Disease %in% c("LAML", "THYM", "DLBC"))) %>%
filter(Disease %in% ciber_frac_corr_df$CancerType) %>%
distinct(Disease) %>%
mutate(Disease = as.character(Disease)) %>%
.$Disease %>%
set_names(.) %>%
as.list()
corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
samples_d <- mirna_sample_pt_df %>%
filter(Disease == d) %>%
select(vial_id) %>%
flatten_chr() %>%
intersect(ciber_frac_corr_df$vial_id)
source_df <- mirna_corr_df %>%
set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
select(one_of(c("Genes", samples_d))) %>%
dplyr::rename(mirna = Genes) %>%
gather(sample, x, -mirna) %>%
filter(!is.na(x))
target_df <- ciber_frac_corr_df %>%
filter(vial_id %in% samples_d) %>%
select(-CancerType) %>%
gather(correlate, x, -vial_id) %>%
dplyr::rename(sample = vial_id)
corr_df <- inner_join(source_df, target_df,
by = "sample",
suffix = c("_source", "_target")) %>%
group_by(mirna, correlate) %>%
do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>%
ungroup()
corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
return(corr_df)
})
mirna_ciber_frac_corr_df <- bind_rows(corr_df_list, .id = "disease") %>%
mutate(correlate_type = "CIBERSORT fraction")
write_feather(mirna_ciber_frac_corr_df, mirna_ciber_frac_corr_file)
} else {
mirna_ciber_frac_corr_df <- read_feather(mirna_ciber_frac_corr_file)
}
Mutation Load: syn5706827 syn7994728 (“mutation-load”)
mut_load_file <- "../data/tcga/mutation-load-vial.txt"
mut_load_df <- read_tsv(mut_load_file)
Parsed with column specification:
cols(
cohort = col_character(),
Patient_ID = col_character(),
Tumor_Sample_ID = col_character(),
Tumor_Sample_ID_Vial = col_character(),
`Silent per Mb` = col_double(),
`Non-silent per Mb` = col_double()
)
Can’t filter aliquots, as data only includes IDs specific to the level of vial
Identify matched samples between miRNA and mutational load.
mut_load_ids <- mut_load_df %>%
select(id = Tumor_Sample_ID_Vial) %>%
mutate(vial_id = id) %>%
arrange()
mirna_mut_load_shared_ids <- inner_join(mirna_ids, mut_load_ids,
by = "vial_id",
suffix = c("_mirna", "_mut_load"))
mut_load_corr_file <- "../data/mut_load_correlates_for_mirna.feather"
force_format <- TRUE
if (!file.exists(mut_load_corr_file) | force_format) {
mut_load_corr_df <- mirna_mut_load_shared_ids %>%
left_join(mut_load_df, by = c("id_mut_load" = "Tumor_Sample_ID_Vial"))
write_feather(mut_load_corr_df, mut_load_corr_file)
} else {
mut_load_corr_df <- read_feather(mut_load_corr_file)
}
mirna_mut_load_corr_file <- "../results/mirna_mut_load_correlation.feather"
force_compute <- TRUE
if (!file.exists(mirna_mut_load_corr_file) | force_compute) {
# skip immune cell cancers?
d_list <- mirna_sample_pt_df %>%
# filter((!Disease %in% c("LAML", "THYM", "DLBC"))) %>%
filter(Disease %in% mut_load_corr_df$cohort) %>%
distinct(Disease) %>%
mutate(Disease = as.character(Disease)) %>%
.$Disease %>%
set_names(.) %>%
as.list()
corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
samples_d <- mirna_sample_pt_df %>%
filter(Disease == d) %>%
select(vial_id) %>%
flatten_chr() %>%
intersect(mut_load_corr_df$vial_id)
source_df <- mirna_corr_df %>%
set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
select(one_of(c("Genes", samples_d))) %>%
dplyr::rename(mirna = Genes) %>%
gather(sample, x, -mirna) %>%
filter(!is.na(x))
target_df <- mut_load_corr_df %>%
filter(vial_id %in% samples_d) %>%
select(-id_mirna, -id_mut_load, -cohort, -Patient_ID,
-Tumor_Sample_ID) %>%
gather(correlate, x, -vial_id) %>%
dplyr::rename(sample = vial_id)
corr_df <- inner_join(source_df, target_df,
by = "sample",
suffix = c("_source", "_target")) %>%
group_by(mirna, correlate) %>%
do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>%
ungroup()
corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
return(corr_df)
})
mirna_mut_load_corr_df <- bind_rows(corr_df_list, .id = "disease") %>%
mutate(correlate_type = "mutation load")
write_feather(mirna_mut_load_corr_df, mirna_mut_load_corr_file)
} else {
mirna_mut_load_corr_df <- read_feather(mirna_mut_load_corr_file)
}
T Cell Receptor / Brown et al: syn5876488 syn7063422 (“mitcr_sampleStatistics_20160714.tsv”)
tcr_div_file <- "../data/tcga/mitcr_sampleStatistics_20160714.tsv"
tcr_div_df <- read_tsv(tcr_div_file)
Parsed with column specification:
cols(
AliquotBarcode = col_character(),
Study = col_character(),
SampleTypeLetterCode = col_character(),
ParticipantBarcode = col_character(),
SampleBarcode = col_character(),
CGHub_analysis_id = col_character(),
totTCR_reads = col_integer(),
totTCRa_reads = col_integer(),
totTCRb_reads = col_integer(),
shannon = col_double(),
numClones = col_integer()
)
tcr_dv_df <- tcr_div_df %>%
filter(!(AliquotBarcode %in% exclude_samples$aliquot_barcode))
Identify matched samples between miRNA and CIBERSORT fraction.
tcr_div_ids <- tcr_div_df %>%
select(id = AliquotBarcode, vial_id = SampleBarcode) %>%
arrange()
# only keep samples with matched vial ID AND portion number
mirna_tcr_div_shared_ids <- inner_join(mirna_ids, tcr_div_ids,
by = "vial_id",
suffix = c("_mirna", "_tcr_div")) %>%
distinct() %>%
filter((str_extract(id_mirna, portion_id_minus_analyte_regex)
== str_extract(id_tcr_div, portion_id_minus_analyte_regex)))
NOTE: several samples were assayed on multiple plates; average the TCR diversity across these before computing correlations with miRNA
Some samples (AliquotBarcode) have 2 values for Shannon entropy, apparently corresponding to separate analyses on CGHub. I’ll go ahead and average these values per aliquot, prior to averaging Shannon values per vial ID. As a small measure of QC, I’ll also only keep observations with at least 1 TCRA read or 1 TCRB read.
tcr_div_corr_file <- "../data/tcr_div_correlates_for_mirna.feather"
force_format <- TRUE
if (!file.exists(tcr_div_corr_file) | force_format) {
tcr_div_corr_df <- mirna_tcr_div_shared_ids %>%
left_join(tcr_div_df, by = c("id_tcr_div" = "AliquotBarcode")) %>%
select(-CGHub_analysis_id) %>%
distinct() %>%
filter(totTCRa_reads > 0 | totTCRb_reads > 0) %>%
group_by(Study, vial_id, id_tcr_div) %>%
summarize(shannon = mean(shannon)) %>%
group_by(Study, vial_id) %>%
summarise(shannon = mean(shannon)) %>%
ungroup()
write_feather(tcr_div_corr_df, tcr_div_corr_file)
} else {
tcr_div_corr_df <- read_feather(tcr_div_corr_file)
}
mirna_tcr_div_corr_file <- "../results/mirna_tcr_div_correlation.feather"
force_compute <- TRUE
if (!file.exists(mirna_tcr_div_corr_file) | force_compute) {
# skip immune cell cancers
d_list <- mirna_sample_pt_df %>%
# filter((!Disease %in% c("LAML", "THYM", "DLBC"))) %>%
filter(Disease %in% tcr_div_corr_df$Study) %>%
distinct(Disease) %>%
mutate(Disease = as.character(Disease)) %>%
.$Disease %>%
set_names(.) %>%
as.list()
corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
samples_d <- mirna_sample_pt_df %>%
filter(Disease == d) %>%
select(vial_id) %>%
flatten_chr() %>%
intersect(tcr_div_corr_df$vial_id)
source_df <- mirna_corr_df %>%
set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
select(one_of(c("Genes", samples_d))) %>%
dplyr::rename(mirna = Genes) %>%
gather(sample, x, -mirna) %>%
filter(!is.na(x))
target_df <- tcr_div_corr_df %>%
filter(vial_id %in% samples_d) %>%
select(-Study) %>%
dplyr::rename(sample = vial_id) %>%
gather(correlate, x, -sample)
corr_df <- inner_join(source_df, target_df,
by = "sample",
suffix = c("_source", "_target")) %>%
group_by(mirna, correlate) %>%
do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>%
ungroup()
corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
return(corr_df)
})
mirna_tcr_div_corr_df <- bind_rows(corr_df_list, .id = "disease") %>%
mutate(correlate_type = "TCR diversity")
write_feather(mirna_tcr_div_corr_df, mirna_tcr_div_corr_file)
} else {
mirna_tcr_div_corr_df <- read_feather(mirna_tcr_div_corr_file)
}
Batch effects normalized mRNA data: syn4976363 syn4976369 (“EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv”) syn4976366 (" EB++GeneExpAnnotation.tsv“)
Sample characteristics are stored in a tab-delimited text file (Synapse ID: syn4976366) and can be loaded with read_tsv().
# load sample data
mrna_sample_file <- mrna_files %>%
filter(file.id == "syn4976366") %>%
.[["file_path"]]
mrna_sample_df <- read_tsv(mrna_sample_file)
Parsed with column specification:
cols(
SampleID = col_character(),
Center = col_character(),
platform = col_character(),
Adjustment = col_character()
)
Remove samples from mRNA dataset.
mrna_sample_df <- mrna_sample_df %>%
filter(!(SampleID %in% exclude_samples$aliquot_barcode))
Identify matched samples between miRNA and mRNA.
mrna_ids <- mrna_sample_df %>%
select(SampleID) %>%
mutate(vial_id = str_replace(SampleID, "(\\-[:alnum:]+){3}$", "")) %>%
arrange()
mirna_mrna_shared_ids <- inner_join(mirna_ids, mrna_ids, by = "vial_id")
# only keep samples with matched vial ID AND portion number
mirna_mrna_shared_ids <- mirna_mrna_shared_ids %>%
filter(str_extract(id, portion_id_minus_analyte_regex)
== str_extract(SampleID, portion_id_minus_analyte_regex))
mRNA normalized, batch corrected expression values for all samples are stored as a matrix in a TSV file (Synapse ID: syn4976369) and can be loaded with read_tsv().
List of genes accessed here and saved as a TSV at data/Cancer Immunomodulators - TCGA PanImmune Group - Direct Relationship.tsv:
gene_correlate_file <- "../data/Cancer Immunomodulators - TCGA PanImmune Group - Direct Relationship.tsv"
gene_correlate_df <- read_tsv(gene_correlate_file)
Missing column names filled in: 'X8' [8], 'X9' [9]Parsed with column specification:
cols(
Gene = col_character(),
`HGNC Symbol` = col_character(),
`Entrez ID` = col_integer(),
`Gene Family` = col_character(),
`Immune Checkpoint` = col_character(),
Function = col_character(),
`Reference(s) [PMID]` = col_character(),
X8 = col_character(),
X9 = col_character()
)
mrna_corr_file <- "../data/mrna_correlates_for_mirna.feather"
force_format <- FALSE
if (!file.exists(mrna_corr_file) | force_format) {
# load normalized, batch-corrected expression data
mrna_norm_file <- mrna_files %>%
filter(file.id == "syn4976369") %>%
.[["file_path"]]
mrna_norm_df <- read_tsv(mrna_norm_file, progress = FALSE)
# gene_list <- c("IFNG", "PRF1", "GZMA", "PDCD1", "CD274", "PDCD1LG2", "IL10",
# "TGFB1", "IDO1", "HLA-A")
mrna_corr_df <- mrna_norm_df %>%
separate(gene_id, c("gene_name", "gene_id"), sep = "\\|") %>%
filter((gene_id %in% gene_correlate_df$`Entrez ID`)
| (gene_name %in% gene_correlate_df$`HGNC Symbol`)) %>%
select(one_of(c("gene_name", "gene_id",
mirna_mrna_shared_ids$SampleID)))
write_feather(mrna_corr_df, mrna_corr_file)
rm(mrna_norm_df)
} else {
mrna_corr_df <- read_feather(mrna_corr_file)
}
mirna_mrna_corr_file <- "../results/mirna_mrna_correlation.feather"
force_compute <- TRUE
if (!file.exists(mirna_mrna_corr_file) | force_compute) {
d_list <- mirna_sample_pt_df %>%
distinct(Disease) %>%
mutate(Disease = as.character(Disease)) %>%
.$Disease %>%
set_names(.) %>%
as.list()
corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
samples_d <- mirna_sample_pt_df %>%
filter(Disease == d) %>%
select(vial_id) %>%
flatten_chr() %>%
intersect(str_replace(names(mrna_corr_df),
"(\\-[:alnum:]+){3}$", ""))
source_df <- mirna_corr_df %>%
set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
select(one_of(c("Genes", samples_d))) %>%
dplyr::rename(mirna = Genes) %>%
gather(sample, x, -mirna) %>%
filter(!is.na(x))
target_df <- mrna_corr_df %>%
set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
select(one_of(c("gene_name", samples_d))) %>%
dplyr::rename(correlate = gene_name) %>%
gather(sample, x, -correlate) %>%
filter(!is.na(x))
corr_df <- inner_join(source_df, target_df,
by = "sample",
suffix = c("_source", "_target")) %>%
group_by(mirna, correlate) %>%
do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>%
ungroup()
corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
return(corr_df)
})
mirna_mrna_corr_df <- bind_rows(corr_df_list, .id = "disease") %>%
mutate(correlate_type = "mRNA expression")
write_feather(mirna_mrna_corr_df, mirna_mrna_corr_file)
} else {
mirna_mrna_corr_df <- read_feather(mirna_mrna_corr_file)
}
bind_rows(mirna_leuk_frac_corr_df, mirna_ciber_frac_corr_df,
mirna_mut_load_corr_df, mirna_tcr_div_corr_df,
mirna_mrna_corr_df) %>%
group_by(disease, correlate_type) %>%
summarise(num_correlations = length(estimate),
significant = sum(p.adjust < 0.05, na.rm = TRUE),
other = num_correlations - significant) %>%
gather(correlations, total, -disease, -correlate_type, -num_correlations) %>%
ggplot(aes(x = disease, y = total)) +
geom_col(aes(fill = disease, alpha = correlations), colour = "slategray") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
axis.text.y = element_text(angle = 45),
legend.position = "top") +
scale_fill_manual(values = tcga_colors$Color) +
scale_alpha_manual(values = c(0.4, 1)) +
guides(fill = FALSE) +
facet_wrap(~ correlate_type, ncol = 2, scales = "free_y")
mirna_sig_corr_df <- bind_rows(mirna_leuk_frac_corr_df, mirna_ciber_frac_corr_df,
mirna_mut_load_corr_df, mirna_tcr_div_corr_df,
mirna_mrna_corr_df) %>%
filter(p.adjust < 0.05) %>%
mutate(disease = factor(disease, levels = tcga_colors$Disease))
mirna_sig_corr_df %>%
ggplot(aes(x = disease, y = estimate)) +
# geom_violin(aes(fill = disease)) +
geom_quasirandom(aes(fill = disease, colour = disease), size = 0.5,
alpha = 0.3, shape = 21) +
ylab("miRNA-correlate correlation estimate (spearman)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_manual(values = tcga_colors$Color) +
scale_colour_manual(values = tcga_colors$Color) +
guides(fill = FALSE, colour = FALSE) +
facet_wrap(~ correlate_type, ncol = 2, scales = "free_x")
mirna_sig_corr_df %>%
mutate(direction = ifelse(estimate > 0, "positive", "negative")) %>%
group_by(disease, correlate_type, direction) %>%
tally() %>%
ungroup() %>%
mutate(count = ifelse(direction == "negative", -1 * n, n)) %>%
ggplot(aes(x = disease, y = count)) +
geom_col(aes(fill = disease, alpha = direction), colour = "slategray") +
# geom_violin(aes(fill = disease)) +
# geom_quasirandom(aes(fill = disease, colour = disease), size = 0.5,
# alpha = 0.3, shape = 21) +
# ylab("miRNA-correlate correlation estimate (spearman)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "top") +
scale_fill_manual(values = tcga_colors$Color) +
scale_alpha_manual(values = c(0.4, 1)) +
guides(fill = FALSE) +
facet_wrap(~ correlate_type, ncol = 2, scales = "free_y")
mirna_all_corr_df <- bind_rows(mirna_leuk_frac_corr_df, mirna_ciber_frac_corr_df,
mirna_mut_load_corr_df, mirna_tcr_div_corr_df,
mirna_mrna_corr_df) %>%
filter(!is.na(p.adjust)) %>%
group_by(mirna, disease, correlate_type) %>%
mutate(num_sig = sum(p.adjust < 0.05 & abs(estimate) > 0.5)) %>%
group_by(mirna, disease) %>%
filter(sum(num_sig > 0) > 2) %>%
ungroup()
mirna_all_corr_df %>%
group_by(disease) %>%
tally()
mirna_mrna_corr_df %>%
filter((!disease %in% c("LAML", "THYM", "DLBC"))) %>%
filter(!is.na(p.adjust),
p.adjust < 0.05,
abs(estimate) > 0.7) %>%
left_join(gene_correlate_df %>%
mutate(`HGNC Symbol` = ifelse(Gene == "TNFA",
"TNF", `HGNC Symbol`),
`HGNC Symbol` = ifelse(Gene == "IL12",
"IL12A", `HGNC Symbol`),
`HGNC Symbol` = ifelse(`HGNC Symbol` == "VISTA",
"C10orf54", `HGNC Symbol`)),
by = c("correlate" = "HGNC Symbol")) %>%
mutate(correlate = ifelse(is.na(`Immune Checkpoint`) & str_detect(Function, "(MHC|Granzyme)"),
Function, str_c("Checkpoint", `Immune Checkpoint`, "Gene")),
correlate = ifelse(is.na(correlate), `Gene Family`, correlate)) %>%
select(disease, mirna, correlate, estimate, statistic, p.value, method, alternative, p.adjust, correlate_type) %>%
bind_rows(mirna_leuk_frac_corr_df, mirna_ciber_frac_corr_df,
mirna_mut_load_corr_df, mirna_tcr_div_corr_df, .) %>%
filter((!disease %in% c("LAML", "THYM", "DLBC"))) %>%
filter(!is.na(p.adjust),
p.adjust < 0.05,
abs(estimate) > 0.7) %>%
mutate(correlate = fct_inorder(correlate)) %>%
# filter(correlate_type == "mRNA expression") %>%
# distinct(correlate) %>%
group_by(disease, mirna, correlate, correlate_type) %>%
summarise(estimate = mean(estimate)) %>%
ungroup() %>%
mutate(direction = ifelse(estimate > 0, "positive", "negative")) %>%
group_by(mirna, correlate, direction) %>%
summarise(diseases = n_distinct(disease)) %>%
group_by(mirna) %>%
mutate(nz_diseases = sum(diseases > 0)) %>%
ungroup() %>%
filter(nz_diseases > 1) %>%
arrange(desc(nz_diseases)) %>%
mutate(mirna = fct_inorder(mirna)) %>%
I %>%
ggplot(aes(y = correlate, x = mirna)) +
geom_point(aes(fill = direction, size = diseases), shape = 21, alpha = 0.8, colour = "black") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))